Note: all colored graphs are interactive.

Data processing for the annual holiday album ranked list

#column_classes = c("numeric","factor","factor","factor","factor","Date","numeric","numeric","numeric","numeric","numeric","numeric")
# Load the four yearly ranked-list exports (one CSV per holiday year).
album_list_2015 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Annual Holiday Albums - Ranked List 2015-2018/2015-Table 1.csv"
)
album_list_2016 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Annual Holiday Albums - Ranked List 2015-2018/2016-Table 1.csv"
)
album_list_2017 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Annual Holiday Albums - Ranked List 2015-2018/2017-Table 1.csv"
)
album_list_2018 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Annual Holiday Albums - Ranked List 2015-2018/2018-Table 1.csv"
)

# Keep only the first 12 columns of the 2015 and 2016 tables; this drops the
# extra column those two exports carry.
# NOTE(review): 2017 and 2018 are not trimmed. The later rbind() of all four
# tables needs matching columns -- confirm those files have no extra column.
album_list_2015 <- album_list_2015[, seq_len(12)]
album_list_2016 <- album_list_2016[, seq_len(12)]

# Clean the 2015 ranked list in a single pipeline:
#   * Rank -> numeric; Title/Artist/Label/Core.Genre -> factor (hablar convert)
#   * Release.Date parsed from month/day/year text
#   * year-to-date count columns converted to numeric after removing commas
#   * holiday.year (constant 2015) and release.year (from Release.Date) added
album_list_2015 <- album_list_2015 %>%
  convert(num(Rank), fct(Title, Artist, Label, Core.Genre)) %>%
  mutate(
    Release.Date = as.Date(Release.Date, format = "%m/%d/%Y"),
    # use gsub to get rid of commas in numeric values before converting
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
        Albums.Sales...YTD,
        Physical.Albums.Sales...YTD,
        Digital.Albums.Sales...YTD,
        Digital.Song.Sales...YTD,
        Streaming.On.Demand.Audio...YTD
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # a length-one value is recycled to every row by mutate()
    holiday.year = 2015,
    release.year = year(Release.Date)
  )

# Clean the 2016 ranked list in a single pipeline:
#   * Rank -> numeric; Title/Artist/Label/Core.Genre -> factor (hablar convert)
#   * Release.Date parsed from month/day/year text
#   * year-to-date count columns converted to numeric after removing commas
#   * holiday.year (constant 2016) and release.year (from Release.Date) added
album_list_2016 <- album_list_2016 %>%
  convert(num(Rank), fct(Title, Artist, Label, Core.Genre)) %>%
  mutate(
    Release.Date = as.Date(Release.Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
        Albums.Sales...YTD,
        Physical.Albums.Sales...YTD,
        Digital.Albums.Sales...YTD,
        Digital.Song.Sales...YTD,
        Streaming.On.Demand.Audio...YTD
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # a length-one value is recycled to every row by mutate()
    holiday.year = 2016,
    release.year = year(Release.Date)
  )

# Clean the 2017 ranked list in a single pipeline:
#   * Rank -> numeric; Title/Artist/Label/Core.Genre -> factor (hablar convert)
#   * Release.Date parsed from month/day/year text
#   * year-to-date count columns converted to numeric after removing commas
#   * holiday.year (constant 2017) and release.year (from Release.Date) added
album_list_2017 <- album_list_2017 %>%
  convert(num(Rank), fct(Title, Artist, Label, Core.Genre)) %>%
  mutate(
    Release.Date = as.Date(Release.Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
        Albums.Sales...YTD,
        Physical.Albums.Sales...YTD,
        Digital.Albums.Sales...YTD,
        Digital.Song.Sales...YTD,
        Streaming.On.Demand.Audio...YTD
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # a length-one value is recycled to every row by mutate()
    holiday.year = 2017,
    release.year = year(Release.Date)
  )

# Clean the 2018 ranked list in a single pipeline:
#   * Rank -> numeric; Title/Artist/Label/Core.Genre -> factor (hablar convert)
#   * Release.Date parsed from month/day/year text
#   * year-to-date count columns converted to numeric after removing commas
#   * holiday.year (constant 2018) and release.year (from Release.Date) added
#
# NOTE(review): the recorded run printed
#   "Warning in as_reliable_num(.): NAs introduced by coercion"
# for the convert() step, so at least one value passed to num() was not
# numeric and became NA -- TODO inspect Rank in the 2018 CSV.
album_list_2018 <- album_list_2018 %>%
  convert(num(Rank), fct(Title, Artist, Label, Core.Genre)) %>%
  mutate(
    Release.Date = as.Date(Release.Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
        Albums.Sales...YTD,
        Physical.Albums.Sales...YTD,
        Digital.Albums.Sales...YTD,
        Digital.Song.Sales...YTD,
        Streaming.On.Demand.Audio...YTD
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # a length-one value is recycled to every row by mutate()
    holiday.year = 2018,
    release.year = year(Release.Date)
  )

# Stack the four cleaned yearly tables row-wise into one data frame.
album_list_df <- as.data.frame(
  do.call(
    rbind,
    list(album_list_2015, album_list_2016, album_list_2017, album_list_2018)
  )
)

Data processing for the daily holiday data

# Load the daily holiday trend exports, one CSV per year (2016-2019).
daily_holiday_2016 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily holiday 2016.csv"
)
daily_holiday_2017 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily holiday 2017.csv"
)
daily_holiday_2018 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily holiday 2018.csv"
)
daily_holiday_2019 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily holiday 2019.csv"
)


# Clean the 2016 daily holiday series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_holiday_2016 <- daily_holiday_2016 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "holiday" marks these rows as not industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "holiday"
  )

# Clean the 2017 daily holiday series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_holiday_2017 <- daily_holiday_2017 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "holiday" marks these rows as not industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "holiday"
  )

# Clean the 2018 daily holiday series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_holiday_2018 <- daily_holiday_2018 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "holiday" marks these rows as not industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "holiday"
  )

# Clean the 2019 daily holiday series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_holiday_2019 <- daily_holiday_2019 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "holiday" marks these rows as not industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "holiday"
  )

Data processing for the daily industry data

# Load the daily industry-wide trend exports, one CSV per year (2016-2019).
daily_industry_2016 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily industry 2016.csv"
)
daily_industry_2017 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily industry 2017.csv"
)
daily_industry_2018 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily industry 2018.csv"
)
daily_industry_2019 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/daily industry 2019.csv"
)

# Clean the 2016 daily industry series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_industry_2016 <- daily_industry_2016 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "industry" marks these rows as industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "industry"
  )

# Clean the 2017 daily industry series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_industry_2017 <- daily_industry_2017 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "industry" marks these rows as industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "industry"
  )

# Clean the 2018 daily industry series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_industry_2018 <- daily_industry_2018 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "industry" marks these rows as industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "industry"
  )

# Clean the 2019 daily industry series: parse Date, convert the metric columns
# to numeric, and add day / year / series-label columns.
daily_industry_2019 <- daily_industry_2019 %>%
  mutate(
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    day = day(Date),
    year = year(Date),
    # series label: "industry" marks these rows as industry data
    # (a length-one value is recycled to every row by mutate())
    industry = "industry"
  )
# Stack all eight cleaned daily tables (holiday first, then industry) row-wise
# into one data frame; the `industry` column tells the two series apart.
daily_holiday_industry <- as.data.frame(
  do.call(
    rbind,
    list(
      daily_holiday_2016,
      daily_holiday_2017,
      daily_holiday_2018,
      daily_holiday_2019,
      daily_industry_2016,
      daily_industry_2017,
      daily_industry_2018,
      daily_industry_2019
    )
  )
)

Data processing for the weekly holiday data

# Load the weekly holiday trend exports, one CSV per year (2015-2019).
weekly_holiday_2015 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/weekly holiday 2015.csv"
)
weekly_holiday_2016 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/weekly holiday 2016.csv"
)
weekly_holiday_2017 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/weekly holiday 2017.csv"
)
weekly_holiday_2018 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/weekly holiday 2018.csv"
)
weekly_holiday_2019 <- read.csv(
  "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends/weekly holiday 2019.csv"
)

# Clean the 2015 weekly holiday series: convert the metric columns to numeric
# and add series-label, week-index and year columns.
weekly_holiday_2015 <- weekly_holiday_2015 %>%
  mutate(
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # series label: "holiday" marks these rows as holiday (not industry) data
    industry = "holiday",
    # week index = row position in the file; seq_len() stays empty for an
    # empty table, whereas 1:0 would produce c(1, 0) and break mutate()
    week = seq_len(n()),
    year = 2015
  )

# Clean the 2016 weekly holiday series: convert the metric columns to numeric
# and add series-label, week-index and year columns.
weekly_holiday_2016 <- weekly_holiday_2016 %>%
  mutate(
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # series label: "holiday" marks these rows as holiday (not industry) data
    industry = "holiday",
    # week index = row position in the file; seq_len() stays empty for an
    # empty table, whereas 1:0 would produce c(1, 0) and break mutate()
    week = seq_len(n()),
    year = 2016
  )

# Clean the 2017 weekly holiday series: convert the metric columns to numeric
# and add series-label, week-index and year columns.
weekly_holiday_2017 <- weekly_holiday_2017 %>%
  mutate(
    # remove commas from the values before converting to numeric
    across(
      c(
        Albums.w.TEA.w.SEA.On.Demand,
        Albums.w.TEA.w.SEA.On.Demand.Audio,
        Albums.w..TEA,
        Total.Album.Sales,
        Physical.Albums.Sales,
        Digital.Albums.Sales,
        Digital.Song.Sales,
        Total.Streaming.On.Demand,
        Streaming.On.Demand.Audio,
        Streaming.On.Demand.Video,
        Total.Streaming.Programmed,
        Streaming.Programmed.Audio,
        Streaming.Programmed.Video,
        Airplay.Spins,
        Airplay.Audience
      ),
      function(x) as.numeric(gsub(",", "", x))
    ),
    # series label: "holiday" marks these rows as holiday (not industry) data
    industry = "holiday",
    # week index = row position in the file; seq_len() stays empty for an
    # empty table, whereas 1:0 would produce c(1, 0) and break mutate()
    week = seq_len(n()),
    year = 2017
  )

weekly_holiday_2018 = weekly_holiday_2018 %>% mutate( 
                                             Albums.w.TEA.w.SEA.On.Demand = as.numeric(gsub(",", "",Albums.w.TEA.w.SEA.On.Demand)),
                                             Albums.w.TEA.w.SEA.On.Demand.Audio = as.numeric(gsub(",", "",Albums.w.TEA.w.SEA.On.Demand.Audio)),
                                             Albums.w..TEA  = as.numeric(gsub(",", "",Albums.w..TEA)),
                                             Total.Album.Sales  = as.numeric(gsub(",", "",Total.Album.Sales)),
                                             Physical.Albums.Sales  = as.numeric(gsub(",", "",Physical.Albums.Sales)),
                                             Digital.Albums.Sales = as.numeric(gsub(",", "",Digital.Albums.Sales)),
                                             Digital.Song.Sales = as.numeric(gsub(",", "",Digital.Song.Sales)),
                                             Total.Streaming.On.Demand = as.numeric(gsub(",", "",Total.Streaming.On.Demand)),
                                             Streaming.On.Demand.Audio = as.numeric(gsub(",", "",Streaming.On.Demand.Audio)),
                                             Streaming.On.Demand.Video = as.numeric(gsub(",", "",Streaming.On.Demand.Video)),
                                             Total.Streaming.Programmed = as.numeric(gsub(",", "",Total.Streaming.Programmed)),
                                             Streaming.Programmed.Audio = as.numeric(gsub(",", "",Streaming.Programmed.Audio)),
                                             Streaming.Programmed.Video = as.numeric(gsub(",", "",Streaming.Programmed.Video)),
                                             Airplay.Spins = as.numeric(gsub(",", "",Airplay.Spins)),
                                             Airplay.Audience = as.numeric(gsub(",", "",Airplay.Audience)),
                                             industry = rep("holiday",dim(weekly_holiday_2018)[1]), #binary variable to indicate it is industry data,
                                             week = c(1:dim(weekly_holiday_2018)[1]),
                                             year = rep(2018,dim(weekly_holiday_2018)[1])
                                             )

weekly_holiday_2019 = weekly_holiday_2019 %>% mutate( 
                                             Albums.w.TEA.w.SEA.On.Demand = as.numeric(gsub(",", "",Albums.w.TEA.w.SEA.On.Demand)),
                                             Albums.w.TEA.w.SEA.On.Demand.Audio = as.numeric(gsub(",", "",Albums.w.TEA.w.SEA.On.Demand.Audio)),
                                             Albums.w..TEA  = as.numeric(gsub(",", "",Albums.w..TEA)),
                                             Total.Album.Sales  = as.numeric(gsub(",", "",Total.Album.Sales)),
                                             Physical.Albums.Sales  = as.numeric(gsub(",", "",Physical.Albums.Sales)),
                                             Digital.Albums.Sales = as.numeric(gsub(",", "",Digital.Albums.Sales)),
                                             Digital.Song.Sales = as.numeric(gsub(",", "",Digital.Song.Sales)),
                                             Total.Streaming.On.Demand = as.numeric(gsub(",", "",Total.Streaming.On.Demand)),
                                             Streaming.On.Demand.Audio = as.numeric(gsub(",", "",Streaming.On.Demand.Audio)),
                                             Streaming.On.Demand.Video = as.numeric(gsub(",", "",Streaming.On.Demand.Video)),
                                             Total.Streaming.Programmed = as.numeric(gsub(",", "",Total.Streaming.Programmed)),
                                             Streaming.Programmed.Audio = as.numeric(gsub(",", "",Streaming.Programmed.Audio)),
                                             Streaming.Programmed.Video = as.numeric(gsub(",", "",Streaming.Programmed.Video)),
                                             Airplay.Spins = as.numeric(gsub(",", "",Airplay.Spins)),
                                             Airplay.Audience = as.numeric(gsub(",", "",Airplay.Audience)),
                                             industry = rep("holiday",dim(weekly_holiday_2019)[1]), #binary variable to indicate it is industry data,
                                             week = c(1:dim(weekly_holiday_2019)[1]),
                                             year = rep(2019,dim(weekly_holiday_2019)[1])
                                             )

data processing for weekly industry

# Folder holding the weekly industry CSV exports, one file per year
weekly_industry_dir = "/Users/tylerchiu/Downloads/mrc data project/Holiday Trends"

weekly_industry_2015 = read.csv(file.path(weekly_industry_dir, "weekly industry 2015.csv"))
weekly_industry_2016 = read.csv(file.path(weekly_industry_dir, "weekly industry 2016.csv"))
weekly_industry_2017 = read.csv(file.path(weekly_industry_dir, "weekly industry 2017.csv"))
weekly_industry_2018 = read.csv(file.path(weekly_industry_dir, "weekly industry 2018.csv"))
weekly_industry_2019 = read.csv(file.path(weekly_industry_dir, "weekly industry 2019.csv"))

# Clean one year of weekly data.
#   df             weekly data frame as loaded from the CSV export
#   industry_label string stored in the `industry` column (e.g. "industry");
#                  it is a text label, not a 0/1 flag
#   year_value     year stored in the `year` column
# Returns `df` with the count columns converted to numeric and the
# `industry`, `week` and `year` columns added (or overwritten).
clean_weekly_counts = function(df, industry_label, year_value) {
  count_cols = c(
    "Albums.w.TEA.w.SEA.On.Demand",
    "Albums.w.TEA.w.SEA.On.Demand.Audio",
    "Albums.w..TEA",
    "Total.Album.Sales",
    "Physical.Albums.Sales",
    "Digital.Albums.Sales",
    "Digital.Song.Sales",
    "Total.Streaming.On.Demand",
    "Streaming.On.Demand.Audio",
    "Streaming.On.Demand.Video",
    "Total.Streaming.Programmed",
    "Streaming.Programmed.Audio",
    "Streaming.Programmed.Video",
    "Airplay.Spins",
    "Airplay.Audience"
  )
  # Strip the commas first so that as.numeric() can parse values like "1,234"
  df[count_cols] = lapply(df[count_cols], function(x) as.numeric(gsub(",", "", x)))

  n_weeks = nrow(df)
  df$industry = rep(industry_label, n_weeks)
  # Rows are numbered in file order; seq_len() stays empty for a zero-row
  # frame, whereas 1:n would produce c(1, 0)
  df$week = seq_len(n_weeks)
  df$year = rep(year_value, n_weeks)
  df
}

weekly_industry_2015 = clean_weekly_counts(weekly_industry_2015, "industry", 2015)

weekly_industry_2016 = clean_weekly_counts(weekly_industry_2016, "industry", 2016)

weekly_industry_2017 = clean_weekly_counts(weekly_industry_2017, "industry", 2017)

weekly_industry_2018 = clean_weekly_counts(weekly_industry_2018, "industry", 2018)

weekly_industry_2019 = clean_weekly_counts(weekly_industry_2019, "industry", 2019)

# Stack the five holiday years followed by the five industry years into a
# single long data frame; the `industry` and `year` columns identify the source
weekly_frames = list(
  weekly_holiday_2015, weekly_holiday_2016, weekly_holiday_2017,
  weekly_holiday_2018, weekly_holiday_2019,
  weekly_industry_2015, weekly_industry_2016, weekly_industry_2017,
  weekly_industry_2018, weekly_industry_2019
)
weekly_holiday_industry = as.data.frame(do.call(rbind, weekly_frames))

data processing for song list

# Folder holding the yearly holiday/seasonal on-demand audio streaming charts
song_list_dir = "/Users/tylerchiu/Downloads/mrc data project/HolidaySeasonal On-Demand Audio Streaming (YTD) Charts"

song_list_2016 = read.csv(file.path(song_list_dir, "2016-Table 1.csv"))
song_list_2017 = read.csv(file.path(song_list_dir, "2017-Table 1.csv"))
song_list_2018 = read.csv(file.path(song_list_dir, "2018-Table 1.csv"))
song_list_2019 = read.csv(file.path(song_list_dir, "2019-Table 1.csv"))

# Remove the commas from YTD.Audio so the values can be converted to numeric
song_list_2016$YTD.Audio = as.numeric(gsub(",", "", song_list_2016$YTD.Audio))
song_list_2017$YTD.Audio = as.numeric(gsub(",", "", song_list_2017$YTD.Audio))
song_list_2018$YTD.Audio = as.numeric(gsub(",", "", song_list_2018$YTD.Audio))
song_list_2019$YTD.Audio = as.numeric(gsub(",", "", song_list_2019$YTD.Audio))

# Stack the four yearly charts into one data frame
song_list_df = as.data.frame(do.call(
  rbind,
  list(song_list_2016, song_list_2017, song_list_2018, song_list_2019)
))

exploratory analysis for album list

# Per-column overview of the 2015 album list (quantiles for numeric/date columns, level counts for factors)
summary(album_list_2015)
##       Rank                            Title                          Artist   
##  Min.   :  1.00   Classic Christmas Album: 10   Bing Crosby             : 13  
##  1st Qu.: 50.75   Soundtrack             :  7   Frank Sinatra           : 13  
##  Median :100.50   Christmas Collection   :  6   Elvis Presley           : 11  
##  Mean   :100.50   Christmas Album        :  4   Mannheim Steamroller    :  8  
##  3rd Qu.:150.25   Home For Christmas     :  4   Trans-Siberian Orchestra:  7  
##  Max.   :200.00   Christmas              :  3   Andy Williams           :  5  
##                   (Other)                :166   (Other)                 :143  
##      Label                Core.Genre   Release.Date       
##  RCA    : 25   Children        :  1   Min.   :1977-09-09  
##  COL    : 20   Christian/Gospel:  1   1st Qu.:1998-07-12  
##  CAP    : 16   Country         :  2   Median :2006-09-27  
##         : 10   Holiday/Seasonal:194   Mean   :2004-09-11  
##  WAR    : 10   Pop             :  1   3rd Qu.:2012-10-05  
##  INT    :  9   Rock            :  1   Max.   :2016-11-25  
##  (Other):110                                              
##  Albums.w.TEA.w.SEA.On.Demand.Audio...YTD Albums.Sales...YTD
##  Min.   : 14035                           Min.   :     1    
##  1st Qu.: 17634                           1st Qu.:  4480    
##  Median : 25308                           Median : 15296    
##  Mean   : 32139                           Mean   : 19461    
##  3rd Qu.: 39546                           3rd Qu.: 27370    
##  Max.   :162621                           Max.   :154952    
##                                           NA's   :16        
##  Physical.Albums.Sales...YTD Digital.Albums.Sales...YTD
##  Min.   :     1              Min.   :    1.0           
##  1st Qu.:  3173              1st Qu.:  662.5           
##  Median : 13029              Median : 2382.0           
##  Mean   : 16757              Mean   : 4293.6           
##  3rd Qu.: 25406              3rd Qu.: 5358.5           
##  Max.   :120095              Max.   :34857.0           
##  NA's   :25                  NA's   :49                
##  Digital.Song.Sales...YTD Streaming.On.Demand.Audio...YTD  holiday.year 
##  Min.   :   970           Min.   :   50139                Min.   :2015  
##  1st Qu.: 16970           1st Qu.: 3158242                1st Qu.:2015  
##  Median : 47930           Median : 8576470                Median :2015  
##  Mean   : 65661           Mean   :12136070                Mean   :2015  
##  3rd Qu.: 82338           3rd Qu.:18731002                3rd Qu.:2015  
##  Max.   :547235           Max.   :75282837                Max.   :2015  
##  NA's   :3                NA's   :8                                     
##   release.year 
##  Min.   :1977  
##  1st Qu.:1998  
##  Median :2006  
##  Mean   :2004  
##  3rd Qu.:2012  
##  Max.   :2016  
## 
# summary(album_list_2015$Artist)
# Quantiles and mean of the 2015 Albums w/ TEA w/ SEA (on-demand audio) YTD column
summary(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14035   17634   25308   32139   39546  162621
# summary(album_list_2016$Artist)
# summary(album_list_2017$Artist)
# summary(album_list_2018$Artist)

# 2015 Albums w/ TEA w/ SEA (YTD) against artist; Artist was converted to a
# factor earlier, so this is presumably drawn as one box per artist
plot(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD~album_list_2015$Artist)

# Same metric against core genre
plot(album_list_2015$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD ~ album_list_2015$Core.Genre)

# On-demand audio streams (YTD) against core genre
plot(album_list_2015$Streaming.On.Demand.Audio...YTD ~ album_list_2015$Core.Genre)

exploratory analysis on song list

# Unique artist/song pairs across all chart years, ordered by chart rank and
# then by most weeks on chart. Grouping by Artist and Song makes
# distinct(Artist) keep the Song column too.
song_list_df %>%
  arrange(Rank, desc(Weeks.On.Chart)) %>%
  group_by(Artist, Song) %>%
  distinct(Artist)
## # A tibble: 367 x 2
## # Groups:   Artist, Song [367]
##    Artist        Song                                       
##    <fct>         <fct>                                      
##  1 Mariah Carey  All I Want For Christmas Is You            
##  2 Brenda Lee    Rockin' Around the Christmas Tree          
##  3 Pentatonix    Hallelujah                                 
##  4 Bobby Helms   Jingle Bell Rock                           
##  5 Andy Williams It's The Most Wonderful Time Of The Year   
##  6 Burl Ives     Have A Holly Jolly Christmas               
##  7 Brenda Lee    Rockin' Around The Christmas Tree          
##  8 Burl Ives     A Holly Jolly Christmas                    
##  9 Michael Buble It's Beginning To Look A Lot Like Christmas
## 10 Wham!         Last Christmas                             
## # … with 357 more rows

terminology

album equivalent unit: 1 album sale = 10 songs downloaded = 1500 streams. TEA = track equivalent album; SEA = streaming equivalent album. On-demand streaming covers services such as Amazon Music, Apple Music, Spotify and YouTube; programmed streaming covers services such as Pandora and Slacker Radio.

Are there differences in consumption behaviors towards music from before and after 2004?

# Flag each album row: 1 when it was released before 2004, 0 otherwise
before_2004_index = ifelse(album_list_df$release.year < 2004, 1, 0)
album_list_df$before_2004 = before_2004_index

# Average ratio of physical album sales to Albums w/ TEA w/ SEA for every
# artist and release year, largest ratio first
album_list_df %>%
  group_by(Artist, release.year) %>%
  summarise(
    avg = mean(Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 303 x 3
## # Groups:   Artist [179]
##    Artist                   release.year   avg
##    <fct>                           <dbl> <dbl>
##  1 Chipmunks                        2001 1    
##  2 Countdown Kids                   2014 1    
##  3 Pentatonix                       2017 1    
##  4 Various Artists                  2013 1    
##  5 George Strait                    2016 1    
##  6 Irish Tenors                     2009 0.987
##  7 Joey + Rory                      2011 0.982
##  8 Randy Travis                     2007 0.969
##  9 Trans-Siberian Orchestra         2012 0.960
## 10 Gaither Vocal Band               2015 0.956
## # … with 293 more rows
# shows that despite older musicians from past eras have more works to be listened to, consumers are still listening to specific artists regardless of number of published works


# Average ratio of physical album sales to Albums w/ TEA w/ SEA for every
# title/artist/release-year combination, largest ratio first
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups:   Title, Artist [329]
##    Title                          Artist                   release.year   avg
##    <fct>                          <fct>                           <dbl> <dbl>
##  1 Christmas Sing-along           Various Artists                  2013 1    
##  2 Santa Claus Music Puzzle       Countdown Kids                   2014 1    
##  3 Vol. 1-christmas With The Chip Chipmunks                        2001 1    
##  4 Strait For The Holidays        George Strait                    2016 1    
##  5 That's Xmas To Me + Ptxmas Del Pentatonix                       2017 1    
##  6 Irish Tenors Christmas         Irish Tenors                     2009 0.987
##  7 A Farmhouse Christmas          Joey + Rory                      2011 0.982
##  8 Songs Of The Season            Randy Travis                     2007 0.969
##  9 Dreams Of Fireflies (on A Chri Trans-Siberian Orchestra         2012 0.960
## 10 Christmas Collection           Gaither Vocal Band               2015 0.956
## # … with 319 more rows
# Average ratio of digital album sales to Albums w/ TEA w/ SEA for every
# title/artist/release-year combination, largest ratio first
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups:   Title, Artist [329]
##    Title                    Artist          release.year   avg
##    <fct>                    <fct>                  <dbl> <dbl>
##  1 Christmas & Chill        Ariana Grande           2015 0.529
##  2 Braxton Family Christmas Braxtons                2015 0.510
##  3 December                 George Winston          1983 0.463
##  4 The Holiday Collection   Taylor Swift            2007 0.457
##  5 Simply Christmas         Leslie Odom Jr          2016 0.432
##  6 Merry Christmas          Johnny Mathis           1977 0.392
##  7 Home For Christmas       Amy Grant               1992 0.390
##  8 Hollens Family Christmas Peter Hollens           2016 0.382
##  9 Wintersong               Sarah McLachlan         2006 0.344
## 10 Christmas Kisses         Ariana Grande           2013 0.340
## # … with 319 more rows
# older artists are streamed more often
# Average number of on-demand audio streams per Albums w/ TEA w/ SEA unit for
# every title/artist/release-year combination, largest ratio first
album_list_df %>%
  group_by(Title, Artist, release.year) %>%
  summarise(
    avg = mean(Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD)
  ) %>%
  arrange(desc(avg))
## # A tibble: 329 x 4
## # Groups:   Title, Artist [329]
##    Title                          Artist                    release.year   avg
##    <fct>                          <fct>                            <dbl> <dbl>
##  1 Winter Wonderland              Crosby, King                      2004 1388.
##  2 I Wish You A Merry Christmas ( Bing Crosby                       2011 1368.
##  3 Merry Christmas From Bing Cros Bing Crosby                       2009 1368.
##  4 10 Great Christmas Songs       Bing Crosby                       2012 1366.
##  5 I Wish You A Merry Christmas   Bing Crosby                       2001 1364.
##  6 Christmas Stars                Christmas Stars                   1991 1359.
##  7 Charlie Brown Christmas        Chestnut, Cyrus & Friends         2000 1356.
##  8 Bing Crosby's Christmas Classi Bing Crosby                       1999 1354.
##  9 Spirit Of Christmas            Spirit Of Christmas               1993 1349.
## 10 Complete Rca Christmas Col     Perry Como                        2013 1348.
## # … with 319 more rows
# even distribution of physical album sales across different release years, but there is an overall decrease in physical album sales
# aside from chipmunks that is from before 2004, the highest physical album sales are from albums after 2004
# Ratio of physical album sales to Albums w/ TEA w/ SEA per album row.
# Artists are ordered on the x axis via reorder() on the negated ratio and the
# bars are coloured by the before_2004 flag.
g = ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Physical.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  # Axis label spelling corrected (was "Proporition")
  ylab("Physical Album Sale Proportion") +
  xlab("Artists") +
  labs(fill = "< 2004") +
  ggtitle("Distribution of Physical Album Sales between Old and New Holiday Music")
# Render the ggplot as an interactive plotly chart
ggplotly(g)
# albums produced after 2004 are digitally purchased more often, but there is a steady decline of digital ownership
# Ratio of digital album sales to Albums w/ TEA w/ SEA per album row, with
# artists ordered via reorder() on the negated ratio and bars coloured by the
# before_2004 flag
g = ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Albums.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  labs(
    x = "Artists",
    fill = "< 2004",
    y = "Digital Album Sale Proportion",
    title = "Distribution of Digital Album Sales between Old and New Holiday Music"
  )
# Render the ggplot as an interactive plotly chart
ggplotly(g)
# individual digital song sales see an equal distribution of music from before and after 2004
# Ratio of digital song sales to Albums w/ TEA w/ SEA per album row, with
# artists ordered via reorder() on the negated ratio and bars coloured by the
# before_2004 flag
g = ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Digital.Song.Sales...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  labs(
    x = "Artists",
    fill = "< 2004",
    y = "Digital Song Sale Proportion",
    title = "Distribution of Digital Song Sales between Old and New Holiday Music"
  )
# Render the ggplot as an interactive plotly chart
ggplotly(g)
# streaming on demand sees an equal distribution of music from before and after 2004
# On-demand audio streams per Albums w/ TEA w/ SEA unit for each album row,
# with artists ordered via reorder() on the negated ratio and bars coloured by
# the before_2004 flag
g = ggplot(
  data = album_list_df,
  aes(
    x = reorder(Artist, -Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD),
    y = Streaming.On.Demand.Audio...YTD / Albums.w.TEA.w.SEA.On.Demand.Audio...YTD,
    fill = as.factor(before_2004)
  )
) +
  geom_col(position = "dodge") +
  labs(
    x = "Artists",
    fill = "< 2004",
    y = "On Demand Streaming Proportion",
    title = "Distribution of On Demand Streaming between Old and New Holiday Music"
  )
# Render the ggplot as an interactive plotly chart
ggplotly(g)
# Year-to-date metrics compared between the two release eras. These are the
# columns previously selected by position (7:12); naming them keeps the
# comparison correct if columns are added or reordered.
ytd_metric_cols = c(
  "Albums.w.TEA.w.SEA.On.Demand.Audio...YTD",
  "Albums.Sales...YTD",
  "Physical.Albums.Sales...YTD",
  "Digital.Albums.Sales...YTD",
  "Digital.Song.Sales...YTD",
  "Streaming.On.Demand.Audio...YTD"
)

# Column means per era, ignoring missing values
before_2004_means = colMeans(album_list_df[album_list_df$before_2004 == 1, ytd_metric_cols], na.rm = TRUE)
after_2004_means = colMeans(album_list_df[album_list_df$before_2004 == 0, ytd_metric_cols], na.rm = TRUE)

# differences between music from before and after 2004
# Symmetric percent difference: the gap divided by the average of the two means
perc_difference_2004 = (after_2004_means - before_2004_means) / ((before_2004_means + after_2004_means) / 2) * 100
perc_difference_2004
## Albums.w.TEA.w.SEA.On.Demand.Audio...YTD 
##                                 3.371509 
##                       Albums.Sales...YTD 
##                                74.995332 
##              Physical.Albums.Sales...YTD 
##                                80.135553 
##               Digital.Albums.Sales...YTD 
##                                72.918539 
##                 Digital.Song.Sales...YTD 
##                                -4.755121 
##          Streaming.On.Demand.Audio...YTD 
##                               -18.861186
# Percent change of each metric, using the before-2004 mean as the baseline
perc_change_2004 = (after_2004_means-before_2004_means)/before_2004_means * 100 # percent change
perc_change_2004
## Albums.w.TEA.w.SEA.On.Demand.Audio...YTD 
##                                 3.429319 
##                       Albums.Sales...YTD 
##                               119.988050 
##              Physical.Albums.Sales...YTD 
##                               133.710294 
##               Digital.Albums.Sales...YTD 
##                               114.758736 
##                 Digital.Song.Sales...YTD 
##                                -4.644691 
##          Streaming.On.Demand.Audio...YTD 
##                               -17.235753

t tests to test for significant difference in mean amongst music from before and after 2004

# Reference value: mean Albums w/ TEA w/ SEA of albums released in 2004 or later
after_2004_tea_sea_mu = mean(
  album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004 == 0)],
  na.rm = TRUE
)
# One-sample t-test of the pre-2004 albums against that reference mean.
# NOTE(review): mu is itself a sample estimate, so a two-sample test of the
# two groups may be the more appropriate comparison - confirm.
t.test(album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004==1)], mu = after_2004_tea_sea_mu)
## 
##  One Sample t-test
## 
## data:  album_list_df$Albums.w.TEA.w.SEA.On.Demand.Audio...YTD[which(album_list_df$before_2004 ==     1)]
## t = -0.87817, df = 377, p-value = 0.3804
## alternative hypothesis: true mean is not equal to 62072.09
## 95 percent confidence interval:
##  55405.86 64622.17
## sample estimates:
## mean of x 
##  60014.02
# Reference value: mean digital album sales of albums released in 2004 or later
after_2004_digital_album_mu = mean(
  album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004 == 0)],
  na.rm = TRUE
)
# One-sample t-test of the pre-2004 albums against that reference mean.
# NOTE(review): mu is itself a sample estimate, so a two-sample test of the
# two groups may be the more appropriate comparison - confirm.
t.test(album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004==1)], mu = after_2004_digital_album_mu)
## 
##  One Sample t-test
## 
## data:  album_list_df$Digital.Albums.Sales...YTD[which(album_list_df$before_2004 ==     1)]
## t = -11.432, df = 242, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 5962.03
## 95 percent confidence interval:
##  2227.208 3325.097
## sample estimates:
## mean of x 
##  2776.152
# Reference value: mean on-demand audio streams of albums released in 2004 or later
after_2004_streaming_mu = mean(
  album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004 == 0)],
  na.rm = TRUE
)
# One-sample t-test of the pre-2004 albums against that reference mean.
# NOTE(review): mu is itself a sample estimate, so a two-sample test of the
# two groups may be the more appropriate comparison - confirm.
t.test(album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004==1)], mu = after_2004_streaming_mu)
## 
##  One Sample t-test
## 
## data:  album_list_df$Streaming.On.Demand.Audio...YTD[which(album_list_df$before_2004 ==     1)]
## t = 3.411, df = 374, p-value = 0.0007178
## alternative hypothesis: true mean is not equal to 50735929
## 95 percent confidence interval:
##  55210963 67392530
## sample estimates:
## mean of x 
##  61301747
#older music is streamed more
# One-sample t-test: mean Physical.Albums.Sales...YTD of the rows with
# before_2004 == 1, tested against the NA-removed mean of the same column for
# the rows with before_2004 == 0 (passed as the fixed null value `mu`).
# NOTE(review): `mu` is itself a sample estimate, so its sampling variability is
# ignored here; confirm whether a two-sample test, t.test(x, y), was intended.
t.test(album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004==1)], mu = mean(album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004==0)],na.rm=T))
## 
##  One Sample t-test
## 
## data:  album_list_df$Physical.Albums.Sales...YTD[which(album_list_df$before_2004 ==     1)]
## t = -16.54, df = 339, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 23064.81
## 95 percent confidence interval:
##   8299.671 11438.276
## sample estimates:
## mean of x 
##  9868.974
# One-sample t-test: mean Digital.Song.Sales...YTD of the rows with
# before_2004 == 1, tested against the NA-removed mean of the same column for
# the rows with before_2004 == 0 (passed as the fixed null value `mu`).
# NOTE(review): `mu` is itself a sample estimate, so its sampling variability is
# ignored here; confirm whether a two-sample test, t.test(x, y), was intended.
t.test(album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004==1)], mu = mean(album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004==0)],na.rm=T))
## 
##  One Sample t-test
## 
## data:  album_list_df$Digital.Song.Sales...YTD[which(album_list_df$before_2004 ==     1)]
## t = 1.0084, df = 375, p-value = 0.3139
## alternative hypothesis: true mean is not equal to 52597.42
## 95 percent confidence interval:
##  50163.78 60155.03
## sample estimates:
## mean of x 
##   55159.4

What (if anything) could be driving that trend?

There are higher counts in streaming than in digital or physical sales across the years. This makes sense as more people switch to streaming. iTunes has recently closed, meaning that users of Apple products are more likely to stream. People are probably reluctant to buy digitally on other platforms, given the continued decrease in non-streaming sales. # What do you expect holiday music in 2020 to look like? What do you expect it to look like in 2021? - which artists, albums, and record labels do well # Are there any anomalies in the data? Anomaly: in June, in week 26, a random spike occurs.

In the data, some older artists are listed with recent release years, which throws off the visualizations.

# Observation: around the holiday season, holiday music is listened to more
# than other genres.
# Anomaly: in June (week 26) there is an unexplained spike.
#
# Dodged bar chart of Airplay.Audience / Airplay.Spins by week, one bar per
# industry value, with one facet per year; rendered interactively via plotly.
g <- ggplot(
  data = weekly_holiday_industry,
  aes(
    x = as.factor(week),
    y = Airplay.Audience / Airplay.Spins,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  labs(
    title = "Airplay Interaction Between Holiday and Other Genres",
    x = "Weeks",
    y = "Airplay Interaction Proportion",
    fill = "Genre"
  )
ggplotly(g)
# Observation: on Christmas Day, digital albums from other genres are purchased
# more; on the other days, digital holiday albums are purchased more.
# Anomaly: why would other music be purchased more on Christmas Day? Possibly
# because people have more disposable income on that day.
#
# Dodged bar chart of Digital.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand by
# day, one bar per industry value, with one facet per year; rendered
# interactively via plotly.
g <- ggplot(
  data = daily_holiday_industry,
  aes(
    x = day,
    y = Digital.Albums.Sales / Albums.w.TEA.w.SEA.On.Demand,
    fill = as.factor(industry)
  )
) +
  geom_col(position = "dodge") +
  facet_wrap(vars(year)) +
  labs(
    title = "Proportion of Digital Album Sales Between Holiday and Other Genres",
    y = "Digital Album Sales Proportion",
    fill = "Genre"
  )
ggplotly(g)

highlights

# Album-list highlights: per-year mean of each metric in columns 7:12 of the
# ranked album lists, ignoring missing values. Per the original note these are:
# total album-equivalent consumption (audio), album sales, physical album
# sales, digital album sales, digital song sales, and streaming.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the NA handling.
album_highlight_2015 <- colMeans(album_list_2015[, 7:12], na.rm = TRUE)
album_highlight_2016 <- colMeans(album_list_2016[, 7:12], na.rm = TRUE)
album_highlight_2017 <- colMeans(album_list_2017[, 7:12], na.rm = TRUE)
album_highlight_2018 <- colMeans(album_list_2018[, 7:12], na.rm = TRUE)

# Percent change of each mean from 2015 to 2018.
album_highlight_change <-
  (album_highlight_2018 - album_highlight_2015) / album_highlight_2015 * 100

# Daily holiday highlights: per-year mean of columns 2:16 of the daily holiday
# tables, ignoring missing values. Per the original note, those columns are:
#   Albums.w.TEA.w.SEA.On.Demand, Albums.w.TEA.w.SEA.On.Demand.Audio,
#   Albums.w..TEA, Total.Album.Sales, Physical.Albums.Sales,
#   Digital.Albums.Sales, Digital.Song.Sales, Total.Streaming.On.Demand,
#   Streaming.On.Demand.Audio, Streaming.On.Demand.Video,
#   Total.Streaming.Programmed, Streaming.Programmed.Audio,
#   Streaming.Programmed.Video, Airplay.Spins, Airplay.Audience
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the NA handling.
daily_holiday_highlight_2016 <-
  colMeans(daily_holiday_2016[, 2:16], na.rm = TRUE)
daily_holiday_highlight_2017 <-
  colMeans(daily_holiday_2017[, 2:16], na.rm = TRUE)
daily_holiday_highlight_2018 <-
  colMeans(daily_holiday_2018[, 2:16], na.rm = TRUE)
daily_holiday_highlight_2019 <-
  colMeans(daily_holiday_2019[, 2:16], na.rm = TRUE)

# Percent change of each mean from 2016 to 2019.
daily_holiday_highlight_change <-
  (daily_holiday_highlight_2019 - daily_holiday_highlight_2016) /
    daily_holiday_highlight_2016 * 100

# Daily industry highlights: per-year mean of columns 2:16 of the daily
# industry tables, ignoring missing values.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the NA handling.
daily_industry_highlight_2016 <-
  colMeans(daily_industry_2016[, 2:16], na.rm = TRUE)
daily_industry_highlight_2017 <-
  colMeans(daily_industry_2017[, 2:16], na.rm = TRUE)
daily_industry_highlight_2018 <-
  colMeans(daily_industry_2018[, 2:16], na.rm = TRUE)
daily_industry_highlight_2019 <-
  colMeans(daily_industry_2019[, 2:16], na.rm = TRUE)

# Percent change of each mean from 2016 to 2019.
daily_industry_highlight_change <-
  (daily_industry_highlight_2019 - daily_industry_highlight_2016) /
    daily_industry_highlight_2016 * 100

# Weekly holiday highlights: per-year mean of columns 2:16 of the weekly
# holiday tables, ignoring missing values.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the NA handling.
weekly_holiday_highlight_2015 <-
  colMeans(weekly_holiday_2015[, 2:16], na.rm = TRUE)
weekly_holiday_highlight_2016 <-
  colMeans(weekly_holiday_2016[, 2:16], na.rm = TRUE)
weekly_holiday_highlight_2017 <-
  colMeans(weekly_holiday_2017[, 2:16], na.rm = TRUE)
weekly_holiday_highlight_2018 <-
  colMeans(weekly_holiday_2018[, 2:16], na.rm = TRUE)
weekly_holiday_highlight_2019 <-
  colMeans(weekly_holiday_2019[, 2:16], na.rm = TRUE)

# Percent change of each mean from 2015 to 2019.
weekly_holiday_highlight_change <-
  (weekly_holiday_highlight_2019 - weekly_holiday_highlight_2015) /
    weekly_holiday_highlight_2015 * 100

# Weekly industry highlights: per-year mean of columns 2:16 of the weekly
# industry tables, ignoring missing values.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the NA handling.
weekly_industry_highlight_2015 <-
  colMeans(weekly_industry_2015[, 2:16], na.rm = TRUE)
weekly_industry_highlight_2016 <-
  colMeans(weekly_industry_2016[, 2:16], na.rm = TRUE)
weekly_industry_highlight_2017 <-
  colMeans(weekly_industry_2017[, 2:16], na.rm = TRUE)
weekly_industry_highlight_2018 <-
  colMeans(weekly_industry_2018[, 2:16], na.rm = TRUE)
weekly_industry_highlight_2019 <-
  colMeans(weekly_industry_2019[, 2:16], na.rm = TRUE)

# Percent change of each mean from 2015 to 2019.
weekly_industry_highlight_change <-
  (weekly_industry_highlight_2019 - weekly_industry_highlight_2015) /
    weekly_industry_highlight_2015 * 100

# Song-list highlight: mean of YTD.Audio for the 2016 and 2019 song lists and
# the percent change between them.
# na.rm = TRUE matches the other highlight computations, so a single missing
# YTD.Audio value no longer turns the mean (and the percent change) into NA.
song_list_2016_audio <- mean(song_list_2016$YTD.Audio, na.rm = TRUE)
song_list_2019_audio <- mean(song_list_2019$YTD.Audio, na.rm = TRUE)

# Percent change of the mean from 2016 to 2019.
song_list_audio_change <-
  (song_list_2019_audio - song_list_2016_audio) / song_list_2016_audio * 100

Reflections